# Replication code for Taylor C. Boas, Dino P. Christenson, and David M. Glick, "Recruiting Large Online Samples in the United States and India: Facebook, Mechanical Turk and Qualtrics," Political Science Research and Methods.

# Analysis conducted in R 3.4.3 on MacOS 10.13.2

# NOTE: This file cleans the raw India survey data. Questionnaires corresponding to the raw data and a codebook for the cleaned data can be found in the main folder. Files should be run in the following order; please see readme.txt for details.
# 	1. clean_us_survey.R
# 	2. clean_india_survey.R
# 	3. merge_external_data_us.R
# 	4. merge_external_data_india.R
# 	5. analyze_demographics.R
# 	6. analyze_spaces.R
# 	7. analyze_politics.R
# 	8. analyze_cooperativeness.R
# 	9. analyze_experiments.R

# Set working directory as appropriate
# setwd('~/Dropbox/sample recruitment shared/replication/')

# Clear workspace and load packages. Please make sure all necessary packages are installed.
# Start from an empty global environment (hidden objects included), then
# attach car, the only package this script loads.
rm(list = ls(all.names = TRUE))
library(car)

# Read the raw export and clean a copy of it, so that `india_raw` itself is
# left untouched by the statements below.
load(file = 'india_raw.RData')
india <- india_raw

# Clean data
# Survey timestamps are parsed as US Eastern times, rows that end before they
# start are repaired in two ordered passes, and the completion time is derived.
india <- local({
  d <- india
  d$StartDate <- as.POSIXct(d$StartDate, tz = 'America/New_York')
  d$EndDate <- as.POSIXct(d$EndDate, tz = 'America/New_York')

  # Pass 1: every row whose end precedes its start gets the start moved back
  # by one hour.
  reversed <- which(d$EndDate < d$StartDate)
  d$StartDate[reversed] <- d$StartDate[reversed] - 3600 # Straddled time change, R inaccurately assumed EST because after 01:30

  # Pass 2: the comparison is redone on the adjusted starts; rows that are
  # still reversed get the end moved forward by one hour.
  reversed <- which(d$EndDate < d$StartDate)
  d$EndDate[reversed] <- d$EndDate[reversed] + 3600 # Straddled time change, R inaccurately assumed EDT because before 01:30

  elapsed_seconds <- as.numeric(d$EndDate) - as.numeric(d$StartDate)
  d$duration <- elapsed_seconds / 60 # Duration in minutes
  # No duration is reported for rows flagged Finished == 0.
  d$duration[d$Finished == 0] <- NA
  d
})

# Rename the B1_3_TEXT column to OS.
names(india)[names(india) == 'B1_3_TEXT'] <- 'OS'

# Q1-Q5 become numeric. Q3 additionally receives a fixed offset of 16
# (NOTE(review): presumably maps the answer code onto the substantive scale;
# confirm against the codebook).
for (item in paste0('Q', 1:5)) {
  india[[item]] <- as.numeric(india[[item]])
}
india$Q3 <- india$Q3 + 16

# Q6 is stored across two columns (Q6a, Q6b). Add them row by row, ignoring
# missing entries; a total of zero (nothing recorded in either column) is
# stored as NA. The two source columns are then dropped.
q6_versions <- as.matrix(india[, c('Q6a', 'Q6b')])
storage.mode(q6_versions) <- 'double'
q6_total <- rowSums(q6_versions, na.rm = TRUE)
q6_total[q6_total == 0] <- NA
india$Q6 <- q6_total
india[c('Q6a', 'Q6b')] <- NULL

rm(item, q6_versions, q6_total)

# Q7-Q10 are each stored across two columns (<item>a and <item>b). For every
# item the two columns are added row by row with missing entries ignored, a
# total of zero becomes NA, and the source columns are dropped.
# For Q8 and Q10 the "b" column has codes 1 and 2 swapped first, and the two
# free-text columns (<item>a_TEXT / <item>b_TEXT) are merged into <item>_text.
for (item in c('Q7', 'Q8', 'Q9', 'Q10')) {
  col_a <- paste0(item, 'a')
  col_b <- paste0(item, 'b')
  party_order_item <- item %in% c('Q8', 'Q10')

  if (party_order_item) {
    india[[col_b]] <- recode(india[[col_b]], '2=1;1=2') # Dealing with party order randomization
  }

  versions <- as.matrix(india[, c(col_a, col_b)])
  storage.mode(versions) <- 'double'
  combined <- rowSums(versions, na.rm = TRUE)
  combined[combined == 0] <- NA
  india[[item]] <- combined
  india[c(col_a, col_b)] <- NULL

  if (party_order_item) {
    text_a <- paste0(col_a, '_TEXT')
    text_b <- paste0(col_b, '_TEXT')
    # Take the "a" text; where it is an empty string, use the "b" text.
    merged_text <- india[[text_a]]
    is_blank <- nchar(merged_text) == 0
    merged_text[is_blank] <- india[[text_b]][is_blank]
    india[[paste0(item, '_text')]] <- merged_text
    india[c(text_a, text_b)] <- NULL
    rm(text_a, text_b, merged_text, is_blank)
  }
}
rm(item, col_a, col_b, party_order_item, versions, combined)

india$Q11 <- as.numeric(india$Q11)

# The four T11_* columns are copied to descriptively named Q11_* columns as
# numerics; values that are zero or negative are stored as NA. The T11_*
# columns are removed afterwards.
q11_timers <- c(T11_1 = 'Q11_firstclick',
                T11_2 = 'Q11_lastclick',
                T11_3 = 'Q11_submit',
                T11_4 = 'Q11_clickcount')
for (source_col in names(q11_timers)) {
  timing <- as.numeric(india[[source_col]])
  timing[timing <= 0] <- NA
  india[[q11_timers[[source_col]]]] <- timing
}
india[names(q11_timers)] <- NULL

# The Q12 columns Q12_1..Q12_4 are stored as numeric Q12a..Q12d, after which
# the originals are removed.
q12_items <- c(Q12_1 = 'Q12a', Q12_2 = 'Q12b', Q12_3 = 'Q12c', Q12_4 = 'Q12d')
for (source_col in names(q12_items)) {
  india[[q12_items[[source_col]]]] <- as.numeric(india[[source_col]])
}
india[names(q12_items)] <- NULL

rm(q11_timers, q12_items, source_col, timing)

# Q13 is scored 1 when the lower-cased response contains any of the accepted
# spellings listed in the pattern, and 0 otherwise (including blank responses).
q13_accepted <- 'modi|narendra|modai|modhi|mody|moodi|mothi|narender mod|narend mood|narendta oti'
india$Q13_correct <- as.numeric(grepl(q13_accepted, tolower(india$Q13)))
rm(q13_accepted)

# Q14 is scored against the respondent's state code in Q24. Everyone starts
# at 0; respondents without a state, or in codes 1/6/8/9, are set to NA.
india$Q14_correct <- 0
india$Q14_correct[india$Q24 == ''] <- NA # No state declared
india$Q14_correct[india$Q24 %in% c(1, 6, 8, 9)] <- NA # Union territories with no chief minister

# Accepted spellings per Q24 code. A response scores 1 when its lower-cased
# text matches the pattern for the respondent's own state.
# NOTE(review): codes 19 and 24 have no pattern and are not in the NA list
# above, so respondents with those codes always score 0. Confirm against the
# codebook that this is intended.
q14_accept <- c(
  '2'  = 'naidu|babu|cbn',
  '3'  = 'tuki',
  '4'  = 'tarun',
  '5'  = 'nitish|kumar',
  '7'  = 'raman',
  '10' = 'arvind|kejriwal|gajriwal|kejiriwal|kejri war',
  '11' = 'parsekar',
  '12' = 'anandi|patel',
  '13' = 'manohar|khattar|khatar|khater|khatter',
  '14' = 'bhadra|singh',
  '15' = 'mufti|sayeed|sayed',
  '16' = 'raghubar|das',
  '17' = 'siddara|maiah|siddhara|sidhara|sidara|siddra|sidra|sidhra|sidha|sidda|siddhra',
  '18' = 'chandy|chandi|chaandi',
  '20' = 'shivraj|singh|chouhan|chauhan|chohan',
  '21' = 'devendra|fadnavis|phadnavis|fadanvis|phadanvis|fadvanis|fadavanis|fandnivs|devandra|fadanavis|fadavnis|fandavis|fadanawis|fadavnis',
  '22' = 'ibobi',
  '23' = 'sangma',
  '25' = 'zeilang',
  '26' = 'naveen|patnaik|nabin|navin|pattanayak',
  '27' = 'rangasamy|rangaswamy|rangaswami',
  '28' = 'parkash|badal|baadal',
  '29' = 'vasundhara|vasundhra|vasundra| raje',
  '30' = 'chamling',
  '31' = 'jaya|jeyala|jeya lalitha|jaylalitha|jailalitha|jailatha|jailitha|jayialeta|jeyalitha',
  '32' = 'kcr|shekar|sekhar|sekhar|shekher|rao|k\\.c \\.r|k\\.c\\.r',
  '33' = 'sarkar',
  '34' = 'rawat',
  '35' = 'akhilesh|yadav',
  '36' = 'mamata|banerjee|mamta|bannerjee|bajerjee|banarji'
)

# Responses that match an accepted pattern but are reset to 0 afterwards.
q14_reject <- c(
  '17' = 'siddalingiah',
  '32' = 'reddy',
  '35' = 'akhilesh kumar$|akhilesh mishra|akhilesh shukla'
)

# Each state only touches its own rows, so all acceptances can be applied
# before the rejections without changing the outcome.
for (state in names(q14_accept)) {
  in_state <- india$Q24 == state
  hits <- grep(q14_accept[[state]], tolower(india$Q14[in_state]))
  india$Q14_correct[in_state][hits] <- 1
}
for (state in names(q14_reject)) {
  in_state <- india$Q24 == state
  hits <- grep(q14_reject[[state]], tolower(india$Q14[in_state]))
  india$Q14_correct[in_state][hits] <- 0
}
rm(q14_accept, q14_reject, state, in_state, hits)

# Q15 is scored 1 when the lower-cased response contains any accepted
# spelling in the pattern, and 0 otherwise.
q15_accepted <- 'republic|rebuplic|rebublic|repulic|replublic|republuc|repablic|replublic|republuc|repablic|re public|republc|repulbic|repuplic|rebplic|rebulic|rebulicday|reepablicday|reoublic|repblic|repblican|repbulic|repiblic|repiblic|replablicday|replic day|repliuday|replubic day|repluic day|repub;ic day|repubalic|repubelic|repubilc|repubilc|republi|republik|repubulic|repulblic|repupblic|respublic|reublic|rpublic|rubuplic|ruplican|rupublic|gantantra|gantatra|guntrant divas|prajasatak|prajasattak|prajatantra dibash|prajatrantadibas|prajatantra dibas|projatanto|constitution|constitute|constutution|public day'
india$Q15_correct <- as.numeric(grepl(q15_accepted, tolower(india$Q15)))

# Q16 is scored 1 when the response matches an accepted spelling and does not
# mention "death". One specific response that mentions both a birthday and a
# death anniversary is scored 1 regardless.
q16_response <- tolower(india$Q16)
q16_accepted <- grepl('gandhi|gandi|ghandhi|ghandi|ganthi|gnadhi|ganshi janti|ghanthi seyanthi|birth|jayanti|jayanthi|jyanti|jaynti|jantanti|non\\-violence|non violence|non\\-voilance', q16_response)
q16_death <- grepl('death', q16_response)
q16_exception <- grepl("gandhiji's birthday and kamaraj's death anniversary", q16_response)
india$Q16_correct <- as.numeric((q16_accepted & !q16_death) | q16_exception)

rm(q15_accepted, q16_response, q16_accepted, q16_death, q16_exception)

# The T16_1..T16_4 columns become numeric Q16_firstclick, Q16_lastclick,
# Q16_submit and Q16_clickcount; zero or negative values are stored as NA and
# the T16_* columns are then dropped.
t16_cols <- paste0('T16_', 1:4)
q16_cols <- paste0('Q16_', c('firstclick', 'lastclick', 'submit', 'clickcount'))
for (k in seq_along(t16_cols)) {
  timing <- as.numeric(india[[t16_cols[k]]])
  timing[timing <= 0] <- NA
  india[[q16_cols[k]]] <- timing
}
india[t16_cols] <- NULL
rm(t16_cols, q16_cols, k, timing)

india$Q17 <- as.numeric(india$Q17)

# Q18 and Q19 each come in two versions (<item>a / <item>b) with matching
# timer columns (T<nn>a_k / T<nn>b_k, k = 1..4). For each item:
#   * <item>_treat is 1 when only the "b" version has a response, 0 when only
#     the "a" version has one, and NA otherwise;
#   * the two response columns are added (missing ignored), zero -> NA;
#   * each pair of timer columns is added the same way, with zero or negative
#     totals -> NA;
#   * all source columns are dropped.
add_pair <- function(col_a, col_b) {
  # Row-wise total of two columns after numeric conversion, missing ignored.
  pair <- as.matrix(india[, c(col_a, col_b)])
  storage.mode(pair) <- 'double'
  rowSums(pair, na.rm = TRUE)
}
timer_labels <- c('firstclick', 'lastclick', 'submit', 'clickcount')

for (item in c('Q18', 'Q19')) {
  answer_a <- paste0(item, 'a')
  answer_b <- paste0(item, 'b')

  arm <- rep(NA, nrow(india))
  arm[india[[answer_a]] == '' & india[[answer_b]] != ''] <- 1
  arm[india[[answer_b]] == '' & india[[answer_a]] != ''] <- 0
  india[[paste0(item, '_treat')]] <- arm

  answer <- add_pair(answer_a, answer_b)
  answer[answer == 0] <- NA
  india[[item]] <- answer
  india[c(answer_a, answer_b)] <- NULL

  timer_stem <- sub('Q', 'T', item)
  for (k in seq_along(timer_labels)) {
    timer_a <- paste0(timer_stem, 'a_', k)
    timer_b <- paste0(timer_stem, 'b_', k)
    timing <- add_pair(timer_a, timer_b)
    timing[timing <= 0] <- NA
    india[[paste0(item, '_', timer_labels[k])]] <- timing
    india[c(timer_a, timer_b)] <- NULL
  }
}
rm(add_pair, timer_labels, item, answer_a, answer_b, arm, answer,
   timer_stem, k, timer_a, timer_b, timing)

# Plain numeric conversions for the remaining closed-ended items.
# Leaving Q25, PIN code, as character. 000000 is invalid, others may be as well.
for (item in c('Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q26', 'Q27', 'Q29', 'Q30', 'Q32')) {
  india[[item]] <- as.numeric(india[[item]])
}

# Q28 is stored across Q28a and Q28b. Q28_dk flags rows where Q28a is an
# empty string. Q28 itself is the row-wise total of the two columns (missing
# ignored), with a total of zero stored as NA.
india$Q28_dk <- india$Q28a == ''
q28_versions <- as.matrix(india[, c('Q28a', 'Q28b')])
storage.mode(q28_versions) <- 'double'
q28_total <- rowSums(q28_versions, na.rm = TRUE)
q28_total[q28_total == 0] <- NA
india$Q28 <- q28_total
india[c('Q28a', 'Q28b')] <- NULL

india$Q32[india$Q32 == -99] <- 2 # Coding those who skipped the question as a "no"

rm(item, q28_versions, q28_total)

# Reorder variables
# Question columns are moved to the end: first the names matched by the
# single-digit pattern, then those matched by the two-digit pattern, each
# group in sorted order. All other columns stay in front, in their current
# order.
sorted_names <- sort(names(india))
single_digit <- grep('Q[0-9]{1}_{0,1}[a-z]{0,4}$', sorted_names, value = TRUE)
double_digit <- grep('Q[0-9]{2}', sorted_names, value = TRUE)
q_order <- c(single_digit, double_digit)
is_question <- names(india) %in% q_order
india <- india[, c(names(india)[!is_question], q_order)]
rm(sorted_names, single_digit, double_digit, is_question)

# Last question answered and percentage progress. Ignoring questions Q31 (optional open-ended comment box) and Q32 (Facebook raffle entry).
#
# For respondents with Finished == 0, last_answered is the name of the last
# question column (names of the form Q<number> plus at most one lower-case
# letter, in the current column order) that holds a non-missing, non-empty
# value. Finished respondents, and anyone whose last answer is Q31 or Q32,
# are recorded as 'Q30'. pct_progress is the question number as a rounded
# percentage of 30.
#
# The index vector is allocated with one slot per row and filled with a
# type-stable vapply(). The earlier version grew a vector only up to the last
# unfinished row (failing or recycling when the final respondent had
# finished) and relied on apply() returning a list, which it does not do when
# every unfinished row has the same number of answers.
q_cols<-grep('^Q[0-9]{1,2}[a-z]{0,1}$',names(india))
q_names<-names(india)[q_cols]
# which() drops rows where Finished is NA, so selection and placement agree.
unfinished<-which(india$Finished==0)
responses<-as.matrix(india[unfinished,q_cols,drop=FALSE])
has_answer<-!is.na(responses) & responses!=''
last_idx<-rep(NA_integer_,nrow(india))
last_idx[unfinished]<-vapply(seq_len(nrow(has_answer)),function(i) {
  hit<-which(has_answer[i,])
  if (length(hit)==0) NA_integer_ else max(hit) # NA when nothing was answered
},integer(1))
india$last_answered<-q_names[last_idx]
india$last_answered[which(india$Finished==1)]<-'Q30'
india$last_answered[india$last_answered %in% c('Q31','Q32')]<-'Q30'
india$pct_progress<-round(100*as.numeric(substr(india$last_answered,2,3))/30)
rm(q_cols,q_names,unfinished,responses,has_answer,last_idx)

save(india,file='india.RData')